| Jobs | 1 |
| Input files |
- /2/scratch/keaton/plague-phylogeography/workflow/logs/iqtree/{reads_origin}/iqtree-core_{locus_name}.filter1.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/iqtree/{reads_origin}/iqtree-core_{locus_name}.filter2.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/iqtree/{reads_origin}/iqtree-core_{locus_name}.filter3.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/iqtree/{reads_origin}/iqtree-core_{locus_name}.filter4.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/iqtree/{reads_origin}/iqtree-core_{locus_name}.filter5.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter1.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter2.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter3.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter4.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter5.log
|
| Output files |
- /2/scratch/keaton/plague-phylogeography/results/iqtree/{reads_origin}/missing_data_{locus_name}.snps.html
|
| Code |
|
|
|
|
| import pandas as pd
import plotly.graph_objects as go
import os
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# --- Run setup: detect reads_origin / thresholds / locus, build paths. ---
# (The hard-coded `data` dict of example counts that used to live here was
# dead code -- it was immediately overwritten by the empty dict below.)

# Identify reads_origin from the tree log paths.
# NOTE(review): assumes each tree-log path contains exactly one of these
# origin tokens -- confirm against the Snakefile rule that feeds this script.
reads_origin_list = ["assembly", "sra", "local", "all"]
for origin in reads_origin_list:
    if origin in snakemake.input.tree_log[0]:
        reads_origin = origin
        break

# Identify the missing-data filter thresholds from the ".filterN." component
# of each tree-log filename (e.g. "...filter5.log" -> "5").
plot_thresholds = []
for tree_log_path in snakemake.input.tree_log:
    threshold = [
        part.replace("filter", "")
        for part in tree_log_path.split(".")
        if "filter" in part
    ][0]
    plot_thresholds.append(threshold)
print(plot_thresholds)

# Identify the locus from the last tree log explicitly (the previous version
# reused the leaked loop variable, which only worked by accident).
# Filenames look like "iqtree-core_<locus>.filterN.log".
last_tree_log = snakemake.input.tree_log[-1]
target_locus = [
    part.replace("iqtree-core_", "").split(".")[0]
    for part in last_tree_log.split("/")
    if "iqtree-core" in part
][0]
print(target_locus)

# Totals reported in the plot title; filled in while parsing the logs.
num_samples = 0
num_sites = 0
num_samples_perc50_ambig = 0

# Uncomment lines if running outside of snakemake
#notebooks_dir = os.getcwd()
#workflow_dir = os.path.dirname(notebooks_dir)
#project_dir = os.path.dirname(workflow_dir)
project_dir = os.getcwd()
output_dir = os.path.join(project_dir, "results", "iqtree", reads_origin)
iqtree_dir = output_dir
logs_dir = os.path.join(project_dir, "workflow", "logs", "snippy_multi", reads_origin)

# Parsed counts, one entry per threshold, consumed by the plotting cells.
data = {
    "missing_data": [],
    "all_variants": [],
    "parsimony_variants": [],
    "perc50_ambig": [],
}
|
|
| ## Parse Data - Snakemake
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Parse the variant counts for each missing-data threshold from the
# snippy-core filter logs and the matching IQ-TREE logs.
# NOTE(review): indentation was flattened in the exported source; the loop
# nesting below is reconstructed (one append per threshold) -- confirm
# against the original notebook.
for threshold in plot_thresholds:
    # --- All variants: from the snippy-core filter log. ---
    # Use the detected locus instead of a hard-coded "chromosome" so the
    # cell matches the {locus_name} wildcard used by its input/output files.
    snippy_core_file = "snippy-core_{}.snps.filter{}.log".format(target_locus, threshold)
    snippy_core_term = "Wrote a multi-fasta alignment of length:"
    num_sites_term = "Alignment length: "
    num_samples_term = "Number of samples: "
    all_variants = 0
    for file in snakemake.input.filter_snp_log:
        if snippy_core_file in file:
            data["missing_data"].append(threshold)
            with open(file, "r") as logfile:
                for line in logfile:
                    if snippy_core_term in line:
                        # int() so plotly gets a numeric (not string) y value.
                        all_variants = int(line.split(": ")[-1].strip())
                    if num_samples_term in line:
                        # strip() drops the trailing newline that otherwise
                        # leaks into the plot title.
                        num_samples = line.split(num_samples_term)[1].strip()
                    if num_sites_term in line:
                        num_sites = line.split(num_sites_term)[1].strip()
            data["all_variants"].append(all_variants)
    # --- Parsimony-informative variants: from the IQ-TREE log. ---
    iqtree_file = "iqtree-core_{}.filter{}.log".format(target_locus, threshold)
    iqtree_term = "parsimony-informative"
    perc50_ambig_term = "gaps/ambiguity"
    parsimony_variants = 0
    perc50_ambig = 0
    for file in snakemake.input.tree_log:
        if iqtree_file in file:
            with open(file, "r") as logfile:
                for line in logfile:
                    if iqtree_term in line:
                        parsimony_variants = int(line.split(" parsimony-informative")[0].strip())
                    if perc50_ambig_term in line:
                        perc50_ambig = line.split(" ")[1]
            data["parsimony_variants"].append(parsimony_variants)
            data["perc50_ambig"].append(perc50_ambig)
print(data)
print("num_samples: ", num_samples)
print("num_sites: ", num_sites)
|
|
|
|
|
|
|
|
| ### Add Data - All Variants
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Add the "All Variants" line to the figure.
# NOTE(review): `fig` is never created anywhere in the visible source --
# presumably a `fig = go.Figure()` cell was dropped from the export.
# Create it here only if it does not already exist, so the cell runs
# standalone without clobbering a figure made earlier.
try:
    fig
except NameError:
    fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=data["missing_data"],
        y=data["all_variants"],
        mode='lines+markers',
        name="All Variants",
        marker=dict(
            #color='LightSkyBlue',
            size=20,
            line=dict(
                color='DarkSlateGrey',
                width=2,
            )
        ),
        line=dict(width=5),
    )
)
|
|
| ### Add Data - Parsimony Informative Variants
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Overlay the parsimony-informative variant counts on the same figure,
# styled identically to the "All Variants" trace.
parsimony_trace = go.Scatter(
    x=data["missing_data"],
    y=data["parsimony_variants"],
    name="Parsimony Informative Variants",
    mode='lines+markers',
    line={"width": 5},
    marker={
        # "color": 'LightSkyBlue',
        "size": 20,
        "line": {"color": 'DarkSlateGrey', "width": 2},
    },
)
fig.add_trace(parsimony_trace)
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Final layout: clean white template, fixed canvas size, and a centred
# two-line title reporting the sample/site totals parsed from the logs.
plot_title = (
    f"<b>Variants Across Missing Data Site Thresholds <br> (Samples: "
    f"{num_samples}, Sites: {num_sites}) </b>"
)
fig.update_layout(
    template="simple_white",
    width=1080,
    height=720,
    title=plot_title,
    title_x=0.5,
    xaxis={
        "title": "Missing Data Threshold Per Site (%)",
        # Tick exactly at the thresholds that were actually parsed.
        "tickvals": plot_thresholds,
    },
    yaxis_title="Number of Variant Sites",
)
|
|
|
|
|
|
|
|
# Export the interactive figure to the HTML path declared by the Snakemake
# rule's output. Fallback when running outside of snakemake:
#   output_plot = os.path.join(output_dir, "missing_data.html")
output_plot = snakemake.output.plot
fig.write_html(output_plot)
|
|
|
|